GNBF5010 Homework 3¶

Student id: 1155228903

Question1¶

(Calculate the proportion of SNPs that are transitions, as well as the number of SNPs in each region)

In [6]:
import pandas as pd

# Class for SNP
class SNP:
    def __init__(self, chrname, pos, snpid, ref_allele, alt_allele):
        self.chrname = chrname
        self.pos = pos
        self.snpid = snpid
        self.ref_allele = ref_allele
        self.alt_allele = alt_allele

    # Check if SNP is Transition
    def is_transition(self):
        ref_alt = (self.ref_allele + self.alt_allele).upper()
        return ref_alt in ["AG", "GA", "CT", "TC"]

    # Check if SNP is Transversion
    def is_transversion(self):
        return not self.is_transition()

# Class for Chromosome
class Chromosome:
    def __init__(self, chrname):
        self.chrname = chrname
        self.locations_to_snps = dict()

    # Add SNP to Chromosome
    def add_snp(self, snp):
        self.locations_to_snps[snp.pos] = snp

    # Count Transitions
    def count_transitions(self):
        count = 0
        for snp in self.locations_to_snps.values():
            if snp.is_transition():
                count += 1
        return count

    # Count Transversions
    def count_transversions(self):
        return len(self.locations_to_snps) - self.count_transitions()

    # Count SNPs in Region (total and transitions)
    def snps_in_region(self, region_start, region_end):
        total = 0
        transitions = 0
        for location in self.locations_to_snps:
            if region_start <= location <= region_end:
                total += 1
                if self.locations_to_snps[location].is_transition():
                    transitions += 1
        return total, transitions

def main():
    # Read in VCF File
    chrnames_to_chrs = dict()
    file_name = "trio.sample.vcf"       # Change this to the name of the VCF file
    with open(file_name, "r") as fh:
        for line in fh:
            if not line.startswith("#"):
                chrname, pos, snpid, ref, alt = line.strip().split("\t")[:5]
                pos = int(pos)
                if chrname not in chrnames_to_chrs:
                    chrnames_to_chrs[chrname] = Chromosome(chrname)
                if ref != alt:
                    newsnp = SNP(chrname, pos, snpid, ref, alt)
                    chrnames_to_chrs[chrname].add_snp(newsnp)

    # Save Results to File
    with open("transitions.txt", "w") as out_file:
        out_file.write("chromosome\tregion\tpercent_transitions\tnum_snps\n")
        region_size = 1000000
        for chrname in chrnames_to_chrs:
            chr_obj = chrnames_to_chrs[chrname]
            last_snp_position = max(chr_obj.locations_to_snps.keys())
            region_start = 1
            while region_start <= last_snp_position:
                region_end = region_start + region_size - 1
                num_snps, num_transitions = chr_obj.snps_in_region(region_start, region_end)
                if num_snps > 0:
                    percent_transitions = num_transitions / num_snps
                else:
                    percent_transitions = 0
                out_file.write(f"{chrname}\t{region_start}..{region_end}\t{percent_transitions:.3f}\t{num_snps}\n")
                region_start += region_size

if __name__ == "__main__":
    main()
    # Display Results
    df = pd.read_csv("transitions.txt", sep="\t")
    display(df)
chromosome region percent_transitions num_snps
0 1 1..1000000 0.722 18
1 1 1000001..2000000 0.794 34
2 1 2000001..3000000 0.681 72
3 1 3000001..4000000 0.662 65
4 1 4000001..5000000 0.747 83
... ... ... ... ...
3028 X 150000001..151000000 0.771 35
3029 X 151000001..152000000 0.737 38
3030 X 152000001..153000000 0.676 37
3031 X 153000001..154000000 0.750 4
3032 X 154000001..155000000 0.533 15

3033 rows × 4 columns

Question2¶

(Simulate the evolutionary process)

In [7]:
import random

# Class for Bug
class Bug:
    def __init__(self):
        self.genome = ''.join(random.choice("ACGT") for _ in range(100))    # Randomly generate genome
        self.fitness = self.get_fitness()

    # Calculate fitness of Bug
    def get_fitness(self):
        num_Gs = self.genome.count('G')
        num_Cs = self.genome.count('C')
        if 'AAAAA' in self.genome:
            return num_Gs + num_Cs + 5
        else:
            return num_Gs + num_Cs

    # Mutate a random base in the genome
    def mutate_random_base(self):
        index = random.randint(0, 99)
        new_base = random.choice("ACGT")
        self.genome = self.genome[:index] + new_base + self.genome[index+1:]
        self.fitness = self.get_fitness()

    # Set a base at a specific index in the genome
    def set_base(self, index, base):
        if 0 <= index < 100 and base in "ACGT":
            self.genome = self.genome[:index] + base + self.genome[index+1:]
            self.fitness = self.get_fitness()
 
    # Compare Bugs based on fitness (facilitate the sorting of the bug list)
    def __lt__(self, other):
        return self.fitness < other.fitness

# Class for Population
class Population:
    def __init__(self):
        self.bug_list = [Bug() for _ in range(50)]  # Create a population of 50 bugs

    # Create offspring by mutating random base in genome
    def create_offspring(self):
        new_pop = []
        for bug in self.bug_list:
            newbug = Bug()
            newbug.genome = bug.genome
            newbug.mutate_random_base()
            new_pop.append(bug)
            new_pop.append(newbug)
        self.bug_list = new_pop

    # Reduces self.bug_list to the top 50% of bug objects by fitness
    def cull(self):
        self.bug_list.sort()
        self.bug_list = self.bug_list[len(self.bug_list)//2:]

    # Get mean fitness of population
    def get_mean_fitness(self):
        return sum(bug.fitness for bug in self.bug_list) / len(self.bug_list)

def main():
    p = Population()
    print("Fitness of population during evolutionary progress")
    print("(Assuming fitness = num_Gs + num_Cs (+ 5, if AAAAA present in genome)):")
    # Run for 20 generations
    for _ in range(20):
        p.create_offspring()
        p.cull()
        print(p.get_mean_fitness())

if __name__ == "__main__":
    main()
Fitness of population during evolutionary progress
(Assuming fitness = num_Gs + num_Cs (+ 5, if AAAAA present in genome)):
53.8
56.32
57.48
58.48
59.34
60.18
60.58
61.18
61.64
62.26
62.56
63.1
63.42
63.8
64.26
64.66
65.24
65.56
65.98
66.3

Question3¶

(Regular Expressions)

  1. Match a word that contains 4 or more consecutive vowels (a, e, i, o, u).
    • Regular Expression for pattern 1: \b\w*([aeiou]{4,})\w*\b
    • The code for testing is below:
In [8]:
import re

pattern1 = r'\b\w*([aeiou]{4,})\w*\b'
test_strings1 = ["hello", "aeie", "daoioaidas", "aebee", "auu"]
for test in test_strings1:
    if re.search(pattern1, test):
        print(f"> '{test}' matches pattern 1")
    else:
        print(f"> '{test}' does not match pattern 1")
> 'hello' does not match pattern 1
> 'aeie' matches pattern 1
> 'daoioaidas' matches pattern 1
> 'aebee' does not match pattern 1
> 'auu' does not match pattern 1
  1. Match a string with at least 12 characters with “z” as the last one.
    • Regular Expression for pattern 2: .{11}z$
    • The code for testing is below:
In [9]:
import re

pattern2 = r'.{11}z$'
test_strings2 = ["hello", "thisisaz", "thisstringhasmorethan11charactersandendswithz", "sjkfbajkhnfjuvaehjfbjkasbhjfvhjkawbgfjksbkjcbxsjkgfhkjagbjkz"]
for test in test_strings2:
    if re.search(pattern2, test):
        print(f"> '{test}' matches pattern 2")
    else:
        print(f"> '{test}' does not match pattern 2")
> 'hello' does not match pattern 2
> 'thisisaz' does not match pattern 2
> 'thisstringhasmorethan11charactersandendswithz' matches pattern 2
> 'sjkfbajkhnfjuvaehjfbjkasbhjfvhjkawbgfjksbkjcbxsjkgfhkjagbjkz' matches pattern 2
  1. Match a number that is only composed of even digits, including 0. But don't allow 0 to be the first digit.
    • Regular Expression for pattern 3: ^(?!0)[02468][02468]*$
    • The code for testing is below:
In [10]:
import re

pattern3 = r'^(?!0)[02468][02468]*$'
test_strings3 = ["248", "4200", "6", "0", "020", "5", "123"]
for test in test_strings3:
    if re.search(pattern3, test):
        print(f"> '{test}' matches pattern 3")
    else:
        print(f"> '{test}' does not match pattern 3")
> '248' matches pattern 3
> '4200' matches pattern 3
> '6' matches pattern 3
> '0' does not match pattern 3
> '020' does not match pattern 3
> '5' does not match pattern 3
> '123' does not match pattern 3
  1. Match an RNA sequence that begins with "AUG" and ends with either of "UAA","UAG", or "UGA".
    • Regular Expression for pattern 4: AUG[ACGU]*(?:UAA|UAG|UGA)$
    • The code for testing is below:
In [11]:
import re

pattern4 = r'AUG[ACGU]*(?:UAA|UAG|UGA)$'
test_strings4 = ["AUGUUUUUAA", "AUGUUUUUGA", "AUGUUUUUAG", "AUGUUUUUUU", "AUGUUTUUAA"]
for test in test_strings4:
    if re.search(pattern4, test):
        print(f"> '{test}' matches pattern 4")
    else:
        print(f"> '{test}' does not match pattern 4")
> 'AUGUUUUUAA' matches pattern 4
> 'AUGUUUUUGA' matches pattern 4
> 'AUGUUUUUAG' matches pattern 4
> 'AUGUUUUUUU' does not match pattern 4
> 'AUGUUTUUAA' does not match pattern 4